{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "54d3f668",
   "metadata": {},
   "outputs": [],
   "source": [
    "pip install beautifulsoup4\n",
    "pip install lxml"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bdffea2b",
   "metadata": {},
   "source": [
    "## parser\n",
    "- html.parser\n",
    "- lxml\n",
    "- xml\n",
    "- html5lib 浏览器的方式解析文档（容错高 速度慢）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a391dde2",
   "metadata": {},
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "\n",
    "html = '''\n",
    "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\"><meta http-equiv=\"X-UA-Compatible\" content=\"IE=edge,chrome=1\"><meta content=\"always\" name=\"referrer\"><meta name=\"theme-color\" content=\"#ffffff\"><meta name=\"description\" content=\"全球领先的中文搜索引擎、致力于让网民更便捷地获取信息，找到所求。百度超过千亿的中文网页数据库，可以瞬间找到相关的搜索结果。\"><link rel=\"shortcut icon\" href=\"/favicon.ico\" type=\"image/x-icon\" /><link rel=\"search\" type=\"application/opensearchdescription+xml\" href=\"/content-search.xml\" title=\"百度搜索\" /><link rel=\"icon\" sizes=\"any\" mask href=\"//www.baidu.com/img/baidu_85beaf5496f291521eb75ba38eacbd87.svg\"><link rel=\"dns-prefetch\" href=\"//dss0.bdstatic.com\"/><link rel=\"dns-prefetch\" href=\"//dss1.bdstatic.com\"/><link rel=\"dns-prefetch\" href=\"//ss1.bdstatic.com\"/><link rel=\"dns-prefetch\" href=\"//sp0.baidu.com\"/><link rel=\"dns-prefetch\" href=\"//sp1.baidu.com\"/><link rel=\"dns-prefetch\" href=\"//sp2.baidu.com\"/><title>百度一下，你就知道</title>\n",
    "<style index=\"newi\" type=\"text/css\">#form .bdsug{top:39px}.bdsug{display:none;position:absolute;width:535px;background:#fff;border:1px solid #ccc!important;_overflow:hidden;box-shadow:1px 1px 3px #ededed;-webkit-box-shadow:1px 1px 3px #ededed;-moz-box-shadow:1px 1px 3px #ededed;-o-box-shadow:1px 1px 3px #ededed}.bdsug li{width:519px;color:#000;font:14px arial;line-height:25px;padding:0 8px;position:relative;cursor:default}.bdsug li.bdsug-s{background:#f0f0f0}.bdsug-store span,.bdsug-store b{color:#7A77C8}.bdsug-store-del{font-size:12px;color:#666;text-decoration:underline;position:absolute;right:8px;top:0;cursor:pointer;display:none}.bdsug-s .bdsug-store-del{display:inline-block}.bdsug-ala{display:inline-block;border-bottom:1px solid #e6e6e6}.bdsug-ala h3{line-height:14px;background:url(//www.baidu.com/img/sug_bd.png?v=09816787.png) no-repeat left center;margin:6px 0 4px;font-size:12px;font-weight:400;color:#7B7B7B;padding-left:20px}.bdsug-ala p{font-size:14px;font-weight:700;padding-left:20px}#m .bdsug .bdsug-direct p{color:#00c;font-weight:700;line-height:34px;padding:0 8px;margin-top:0;cursor:pointer;white-space:nowrap;overflow:hidden}#m .bdsug .bdsug-direct p img{width:16px;height:16px;margin:7px 6px 9px 0;vertical-align:middle}#m .bdsug .bdsug-direct p span{margin-left:8px}#form .bdsug .bdsug-direct{width:auto;padding:0;border-bottom:1px solid #f1f1f1}#form .bdsug .bdsug-direct p i{font-size:12px;line-height:100%;font-style:normal;font-weight:400;color:#fff;background-color:#2b99ff;display:inline;text-align:center;padding:1px 5px;*padding:2px 5px 0;margin-left:8px;overflow:hidden}.bdsug .bdsug-pcDirect{color:#000;font-size:14px;line-height:30px;height:30px;background-color:#f8f8f8}.bdsug .bdsug-pc-direct-tip{position:absolute;right:15px;top:8px;width:55px;height:15px;display:block;background:url(https://pss.bdstatic.com/r/www/cache/static/protocol/https/global/img/pc_direct_42d6311.png) no-repeat 0 0}.bdsug li.bdsug-pcDirect-s{background-color:#f0f0f0}.bdsug .bdsug-pcDirect-is{color:#000;font-size:14px;line-height:22px;background-color:#f5f5f5}.bdsug .bdsug-pc-direct-tip-is{position:absolute;right:15px;top:3px;width:55px;height:15px;display:block;background:url(https://pss.bdstatic.com/r/www/cache/static/protocol/https/global/img/pc_direct_42d6311.png) no-repeat 0 0}.bdsug li.bdsug-pcDirect-is-s{background-color:#f0f0f0}.bdsug .bdsug-pcDirect-s .bdsug-pc-direct-tip,.bdsug .bdsug-pcDirect-is-s .bdsug-pc-direct-tip-is{background-position:0 -15px}.bdsug .bdsug-newicon{color:#929292;opacity:.7;font-size:12px;display:inline-block;line-height:22px;letter-spacing:2px}.bdsug .bdsug-s .bdsug-newicon{opacity:1}.bdsug .bdsug-newicon i{letter-spacing:0;font-style:normal}.bdsug .bdsug-feedback-wrap{display:none}.toggle-underline{text-decoration:none}.toggle-underline:hover{text-decoration:underline}.bdpfmenu,.usermenu{border:1px solid #d1d1d1;position:absolute;width:105px;top:36px;z-index:302;box-shadow:1px 1px 5px #d1d1d1;-webkit-box-shadow:1px 1px 5px #d1d1d1;-moz-box-shadow:1px 1px 5px #d1d1d1;-o-box-shadow:1px 1px 5px #d1d1d1}.bdpfmenu{font-size:12px;background-color:#fff}.bdpfmenu a,.usermenu a{display:block;text-align:left;margin:0!important;padding:0 9px;line-height:26px;text-decoration:none}.briiconsbg{background-repeat:no-repeat;background-size:300px 18px;background-image:url(https://pss.bdstatic.com/r/www/cache/static/protocol/https/home/img/icons_0c37e9b.png);background-image:url(https://pss.bdstatic.com/r/www/cache/static/protocol/https/home/img/icons_809ae65.gif)\\9}.bdpfmenu a:link,.bdpfmenu a:visited,#u .usermenu a:link,#u .usermenu a:visited{background:#fff;color:#333}.bdpfmenu a:hover,.bdpfmenu a:active,#u .usermenu a:hover,#u .usermenu a:active{background:#38f;text-decoration:none;color:#fff}.bdpfmenu{width:70px}#wrapper .bdnuarrow{width:0;height:0;font-size:0;line-height:0;display:block;position:absolute;top:-10px;left:50%;margin-left:-5px}#wrapper .bdnuarrow em,#wrapper .bdnuarrow i{width:0;height:0;font-size:0;line-height:0;display:block;position:absolute;border:5px solid transparent;border-style:dashed dashed solid}#wrapper .bdnuarrow em{border-bottom-color:#d8d8d8;top:-1px}#wrapper .bdnuarrow i{border-bottom-color:#fff;top:0}#gxszHead .prefpanelclose{cursor:pointer;width:16px;height:16px;float:right;margin-top:7px;background-position:-248px 0}#gxszHead .prefpanelclose:hover{background-position:-264px 0}.s_ipt::-webkit-input-placeholder{padding-left:3px;color:#aaa;font-size:13px}.s_ipt::-moz-placeholder{padding-left:3px;color:#aaa;font-size:13px}.s_ipt:-ms-input-placeholder{padding-left:3px;color:#aaa;font-size:13px}.s_ipt::placeholder{padding-left:3px;color:#aaa;font-size:13px}.kw-placeholder{position:absolute;top:0;left:0;color:#aaa;font-size:13px;height:40px;line-height:40px;padding-left:10px;max-width:360px;z-index:99;pointer-events:none}.kw-placeholder.kw-placehlder-high{height:40px;line-height:40px}.kw-placeholder.placeholders-hidden{visibility:hidden}#head_wrapper #form .bdsug-new{width:544px;top:35px;border-radius:0 0 10px 10px;border:2px solid #4E6EF2!important;border-top:0!important;box-shadow:none;font-family:Arial,\"PingFang SC\",\"Microsoft YaHei\",sans-serif;z-index:1}#head_wrapper.sam_head_wrapper2 #form .bdsug-new{width:545px;z-index:1;border:1px solid #4E6EF2!important;border-top:0!important}#head_wrapper #form .bdsug-new ul{margin:7px 14px 0;padding:8px 0 7px;background:0 0;border-top:2px solid #f5f5f6}#head_wrapper #form .bdsug-new ul li{width:auto;padding-left:14px;margin-left:-14px;margin-right:-14px;color:#626675;line-height:28px;background:0 0;font-family:Arial,\"PingFang SC\",\"Microsoft YaHei\",sans-serif}#head_wrapper #form .bdsug-new ul li span{color:#626675}#head_wrapper #form .bdsug-new ul li b{font-weight:400;color:#222}#head_wrapper #form .bdsug-new .bdsug-store-del{font-size:13px;text-decoration:none;color:#9195A3;right:16px}#head_wrapper #form .bdsug-new .bdsug-store-del:hover{color:#315EFB;cursor:pointer}#head_wrapper #form .bdsug-new ul li:hover,#head_wrapper #form .bdsug-new ul li:hover span,#head_wrapper #form .bdsug-new ul li:hover b{cursor:pointer}.wrapper_new #form .bdsug-new .bdsug-s{background-color:#F5F5F6!important}.wrapper_new #form .sam_search .bdsug-new .bdsug-s{background-color:#F1F3FD!important}#head_wrapper #form .sam_search .bdsug-new .bdsug-s{background-color:#F1F3FD!important}#head .s-down #form .bdsug-new{top:32px}.s-skin-hasbg #head_wrapper #form .bdsug-new{border-color:#4569ff!important;border-top:0!important}.s-skin-hasbg #head_wrapper.s-down #form .bdsug-new{border-color:#4e6ef2!important;border-top:0!important}.s-skin-hasbg #head_wrapper.s-down #form.sam_search .bdsug-new{border-color:rgba(0,0,0,.05)!important;border-top:1px solid rgba(0,0,0,.05)!important;top:54px!important}#head_wrapper #form .bdsug-new .bdsug-s,#head_wrapper #form .bdsug-new .bdsug-s span,#head_wrapper #form .bdsug-new .bdsug-s b{color:#315EFB}#head_wrapper #form .bdsug-new>div span:hover,#head_wrapper #form .bdsug-new>div a:hover{color:#315EFB!important}#head_wrapper #form #kw.new-ipt-focus{border-color:#4e6ef2}#head_wrapper.s-down #form .sam-bdsug.bdsug-new{top:52px}#head_wrapper #form .sam-bdsug.bdsug-new{width:100%;box-shadow:0 4px 4px 0 rgba(0,0,0,.1);border:1px solid rgba(0,0,0,.05)!important;border-radius:12px;top:56px}#head_wrapper #form .sam-bdsug.bdsug-new ul{border:0;padding:0 0 7px}#head_wrapper #form .sam-bdsug.bdsug-new ul li{line-height:32px}#head_wrapper #form .sam-bdsug.bdsug-new ul .bdsug-s{background-color:#F1F3FD!important}#head_wrapper #form .sam-bdsug.bdsug-new .bdsug-store-del{right:15px}.sam_search .sam_search_rec,.sam_search .sam_search_soutu{z-index:1;display:none;position:absolute;top:50%;margin-top:-12px;font-size:24px;color:#4E6EF2;height:24px;line-height:24px;width:24px;cursor:pointer;-webkit-transform:translate3d(0,0,0);transform:translate3d(0,0,0);transition:transform .3s ease}.sam_search .sam_search_rec{right:54px}.sam_search .sam_search_soutu{right:14px}.sam_search .sam_search_rec:hover,.sam_search .sam_search_soutu:hover{color:#1D4FFF!important;transform:scale(1.08,1.08)}.sam_search .sam_search_rec_hover,.sam_search .sam_search_soutu_hover{background:#626675;border-radius:8px;height:32px;width:76px;text-align:center;line-height:32px;font-size:13px;color:#FFF;position:absolute;z-index:2;top:50px}.sam_search .sam_search_rec_hover:before,.sam_search .sam_search_soutu_hover:before{content:'';border:4px solid transparent;border-bottom:4px solid #626675;position:absolute;left:50%;top:-8px;margin-left:-4px}.sam_search .sam_search_rec_hover{right:29px}.sam_search .sam_search_soutu_hover{display:none;right:-12px}</style>\n",
    "</head>\n",
    "<body>\n",
    "    <div>asb</div>\n",
    "</body>\n",
    "</html>\n",
    "'''\n",
    "soup = BeautifulSoup(html, 'html.parser')\n",
    "print(soup.prettify())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8ba819dc",
   "metadata": {},
   "source": [
    "# 标签选择器"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ef215ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "import requests\n",
    "\n",
    "resp = requests.get('http://www.baidu.com')\n",
    "resp.encoding = 'utf-8'\n",
    "soup = BeautifulSoup(resp.text, 'html.parser')\n",
    "\n",
    "print(soup.title) # 标签对象\n",
    "print('------------')\n",
    "print(soup.title.name) # 标签的名称\n",
    "print('------------')\n",
    "print(type(soup.title)) # 标签类型\n",
    "print('------------')\n",
    "print(soup.head)\n",
    "print('------------')\n",
    "print(soup.p) # 第一个p标签\n",
    "print('------------')\n",
    "print(soup.p.attrs['id']) # 获取标签上的属性\n",
    "print('------------')\n",
    "print(soup.p['id']) # 获取标签上的属性 同 attrs\n",
    "print('------------')\n",
    "print(soup.a.string) # 获取标签里的文字内容\n",
    "print('------------')\n",
    "print(soup.head.title.string) # 嵌套的标签通过点获取\n",
    "print('------------')\n",
    "print(soup.head.contents) # 获取子节点， 通过列表返回\n",
    "print('childrens：')\n",
    "childrens = soup.head.children # 获取子节点，返回一个迭代器\n",
    "print(type(childrens))\n",
    "for i, child in enumerate(soup.head.children):\n",
    "    print(child)\n",
    "print('descendants：')\n",
    "descendants = soup.head.descendants # 获取子节点和孙子节点，返回一个迭代器\n",
    "print(type(descendants))\n",
    "for i, child in enumerate(descendants):\n",
    "    print(child)\n",
    "\n",
    "print('------------')\n",
    "print(soup.head.title.parent) # 获取父节点  .parents所以祖先节点\n",
    "print('------------')\n",
    "siblings = soup.head.next_siblings  # previous_siblings\n",
    "print(type(siblings))\n",
    "# print(list(enumerate(siblings)))\n",
    "for i, sib in enumerate(siblings):\n",
    "    print(sib)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0536f7f8",
   "metadata": {},
   "source": [
    "## 标准选择器  find_all(name, attrs, recursive, text, **kwargs) find(name, attrs, recursive, text, **kwargs)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "494c3a01",
   "metadata": {},
   "source": [
    "find_all返回的标签可以继续在这个标签上调用find_all()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6520a338",
   "metadata": {},
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "import requests\n",
    "\n",
    "resp = requests.get('http://www.baidu.com')\n",
    "resp.encoding = 'utf-8'\n",
    "soup = BeautifulSoup(resp.text, 'html.parser')\n",
    "print(soup.find_all('a'))\n",
    "print(soup.find_all('a')[0])\n",
    "print(soup.find_all('a', attrs={ 'class':'mnav' }))\n",
    "print(soup.find_all('a', text='贴吧'))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cb8fc686",
   "metadata": {},
   "source": [
    "## find_parents() find_parent() find_next_siblings() find_next_sibling()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8e5d3eab",
   "metadata": {},
   "source": [
    "## find_all_next() find_next()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7b3de44f",
   "metadata": {},
   "source": [
    "## find_all_previous() find_previous()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2fb65e55",
   "metadata": {},
   "source": [
    "# css选择器"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ed9e252d",
   "metadata": {},
   "source": [
    "## select"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d2e92e15",
   "metadata": {},
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "import requests\n",
    "\n",
    "resp = requests.get('http://www.baidu.com')\n",
    "resp.encoding = 'utf-8'\n",
    "soup = BeautifulSoup(resp.text, 'html.parser')\n",
    "print(soup.select('.mnav')[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "df07a0c4",
   "metadata": {},
   "source": [
    "## 选择器获取属性"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1f78f785",
   "metadata": {},
   "source": [
    "- findedItem.attrs['id'] 获取属性\n",
    "- findedItem.get_text() 获取文本内容"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
